!pip install pandas-profiling
import pandas_profiling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the Kaggle house-prices data.  'Id' is a pure row identifier with no
# predictive value, so it is dropped on read.
train = pd.read_csv('../input/train.csv').drop('Id', axis=1)
test = pd.read_csv('../input/test.csv').drop('Id', axis=1)
for label, frame in (('Train', train), ('Test', test)):
    print(f'{label}: {frame.shape}')
# Interactive EDA report (pandas-profiling); full_width widens the layout.
train.profile_report(style={'full_width': True})
# Pearson correlation of each numeric feature with the target (self-corr
# of SalePrice dropped), drawn as a sorted horizontal bar chart.
df_corr = train.corr()['SalePrice'].drop('SalePrice')
ax = df_corr.sort_values().plot(
    kind='barh',
    title='Correlation between Features and Sale Price',
    figsize=(16, 8),
)
plt.xlabel('Correlation')
plt.ylabel('Features')

# Split features from target and record the column groups for later use.
X = train.copy()
y = X.pop('SalePrice')
cat = X.describe(include='O').columns  # object-dtype (categorical) columns
num = X.describe().columns             # numeric columns
def fit_predict(model, i=10):
    """Fit *model* on repeated random splits of the global X/y and report MAE.

    Runs `i` independent train_test_split rounds (no fixed seed, so the
    splits — and therefore the scores — vary between calls), prints each
    round's integer MAE plus the mean, and line-plots the per-round scores.
    Only the numeric columns listed in the global `num` are used, with NaNs
    zero-filled.

    Parameters
    ----------
    model : estimator with sklearn-style fit/predict.
    i : int, number of resampling rounds (default 10).
    """
    # Hoisted out of the loop: X is not mutated, so the fill/slice is
    # loop-invariant.
    features = X.fillna(0)[num]
    scores = []
    for _ in range(i):
        train_X, test_X, train_y, test_y = train_test_split(features, y)
        model.fit(train_X, train_y)
        yhat = model.predict(test_X)
        scores.append(int(MAE(test_y, yhat)))
    mean = int(np.mean(scores))
    print('Iter:', *scores)
    print(f'Mean: {mean}')
    # BUG FIX: was str(lr.__class__) — it always labelled the plot with the
    # *global* LinearRegression, regardless of which model was passed in.
    name = str(model.__class__)[16:-2]
    pd.DataFrame(scores).plot(kind='line', figsize=(10, 2), title=str(name))
# Baseline comparison of three regressors on the raw numeric features.
# (The names lr/rt/gbr are kept: lr is read inside fit_predict and gbr is
# rebound further down.)
lr = LinearRegression()
rt = RandomForestRegressor(random_state=1)
gbr = GradientBoostingRegressor(max_depth=5)
for estimator in (lr, rt, gbr):
    fit_predict(estimator)
def fillna(df):
    """Impute the known-missing columns of the Ames housing frame in place.

    Most NaNs in this dataset encode "feature absent" (no garage, no pool,
    no basement, ...), so categorical columns get the sentinel 'NA'/'None'
    and numeric count/area columns get 0.  Data-driven fills: Electrical and
    GarageYrBlt use the column mode, LotFrontage the column minimum.

    Mutates *df* and also returns it, for chaining.
    """
    fillna_dict = {
        'Electrical': df['Electrical'].mode()[0],
        'FireplaceQu': 'NA',
        'GarageType': 'NA',
        'GarageYrBlt': df['GarageYrBlt'].mode()[0],  # most common build year
        'GarageFinish': 'Unf',
        'GarageCars': 0,
        'GarageArea': 0,
        'GarageQual': 'NA',
        'GarageCond': 'NA',
        'PoolQC': 'NA',
        'Fence': 'NA',
        'MiscFeature': 'NA',
        'BsmtQual': 'NA',
        'BsmtCond': 'NA',
        'BsmtExposure': 'NA',
        'BsmtFinType1': 'NA',
        'BsmtFinType2': 'NA',
        'MasVnrType': 'None',
        'Alley': 'NA',
        'MasVnrArea': 0.0,
        'LotFrontage': df['LotFrontage'].min()
    }
    # Single dict-based fill instead of a per-column loop of
    # Series.fillna(inplace=True) calls, which raise chained-assignment
    # warnings (and can silently no-op) under copy-on-write pandas.
    df.fillna(value=fillna_dict, inplace=True)
    return df
def remap_col(remap, to_remap, frame=None):
    """Replace categorical codes with mapped values in the given columns.

    Parameters
    ----------
    remap : dict mapping existing cell values to replacements; values not
        present in the dict are left untouched by Series.replace.
    to_remap : iterable of column names to transform.
    frame : DataFrame to modify in place.  Defaults to the module-level `df`
        for backward compatibility with the existing call sites.  (The
        original `global df` statement was unnecessary — the function only
        item-assigns into the frame, it never rebinds the name.)
    """
    target = df if frame is None else frame
    for col in to_remap:
        target[col] = target[col].replace(remap)
df = train.copy()
# Ordinal encoding for the quality/condition scales.  Per the Ames housing
# data dictionary the order is Po < Fa < TA (Typical/Average) < Gd < Ex,
# so Fair must map BELOW Typical.
# BUG FIX: the original mapped TA->2 and Fa->3, inverting those two levels.
to_remap_qal = ['ExterQual', 'KitchenQual', 'BsmtQual', 'HeatingQC', 'BsmtCond', 'PoolQC']
remap_qal = {
    'NA': 0,  # feature absent
    'Po': 1,  # poor
    'Fa': 2,  # fair
    'TA': 3,  # typical/average
    'Gd': 4,  # good
    'Ex': 5,  # excellent
}
to_remap_YN = ['CentralAir']
remap_YN = {
    'N': 0,
    'Y': 1
}
remap_col(remap_qal, to_remap_qal)
remap_col(remap_YN, to_remap_YN)
# Re-fit the boosted model now that the ordinal/binary remaps have turned
# more columns numeric.
numeric_cols = df.describe().columns
X = df[numeric_cols]
y = X.pop('SalePrice')
gbr = GradientBoostingRegressor(max_depth=5)
fit_predict(gbr)
# Score each categorical column by how strongly its one-hot dummies
# correlate with SalePrice, then one-hot-encode only the strong ones.
one_hot_encode = {}
for col in cat:
    try:
        dummies = pd.get_dummies(df[col])
        temp = pd.concat([dummies, y], axis=1)
        # BUG FIX: drop SalePrice's self-correlation (always 1.0) before
        # averaging — including it inflated every column's score toward
        # the 0.2 cutoff.
        score = abs(temp.corr()['SalePrice'].drop('SalePrice')).mean()
        if score > .2:
            one_hot_encode[col] = score
    except (KeyError, ValueError, TypeError):
        # Narrowed from a bare `except:` — skip columns pandas cannot
        # correlate, but let real failures (KeyboardInterrupt, ...) surface.
        pass
one_hot_encode = sorted(one_hot_encode.items(), key=lambda x: x[1], reverse=True)
OHE_cols = [k for k, v in one_hot_encode]
print(OHE_cols)
df1 = pd.get_dummies(df, columns=OHE_cols)
# NOTE(review): fit_predict slices features with the original `num` columns,
# so the dummy columns created here never actually reach the model — and
# select_dtypes([float, int]) may drop bool-dtype dummies on newer pandas.
# Confirm whether the encoded columns are meant to be used.
X = df1.select_dtypes(include=[float, int])
y = X.pop('SalePrice')
gbr = GradientBoostingRegressor(max_depth=5)
fit_predict(gbr)